import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
df = pd.read_csv('D:\DOCUMENTS AND LECTURES\BS ECO\MetroLiving_Data new.csv')
df.head()
| neighborhood | property_type | units | avg_size | distance_to_public_transport | distance_to_schools | distance_to_park | recent_sale_price | year_built | crime_rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Columbia Heights | Condominium | 21 | 1571.0 | 0.67 | 0.72 | 0.99 | 1739228.0 | 1967 | 5.7 |
| 1 | Petworth | Townhouse | 24 | 1173.0 | 0.17 | 0.80 | 0.83 | NaN | 1990 | 12.1 |
| 2 | U Street Corridor | Single Family Home | 32 | 980.0 | 0.75 | 0.66 | 0.91 | 701071.0 | 2011 | 5.2 |
| 3 | Adams Morgan | Townhouse | 14 | 1241.0 | 0.41 | 0.48 | 0.49 | 1186984.0 | 1968 | 20.7 |
| 4 | Dupont Circle | Townhouse | 11 | 817.0 | 0.63 | 0.45 | 0.58 | 1869701.0 | 2015 | 16.1 |
print(df.isnull().sum())
neighborhood 0 property_type 0 units 0 avg_size 13 distance_to_public_transport 0 distance_to_schools 0 distance_to_park 0 recent_sale_price 238 year_built 0 crime_rate 0 dtype: int64
df.dropna(inplace=True)
print(df.isnull().sum())
neighborhood 0 property_type 0 units 0 avg_size 0 distance_to_public_transport 0 distance_to_schools 0 distance_to_park 0 recent_sale_price 0 year_built 0 crime_rate 0 dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 1750 entries, 0 to 1999 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 neighborhood 1750 non-null object 1 property_type 1750 non-null object 2 units 1750 non-null int64 3 avg_size 1750 non-null float64 4 distance_to_public_transport 1750 non-null float64 5 distance_to_schools 1750 non-null float64 6 distance_to_park 1750 non-null float64 7 recent_sale_price 1750 non-null float64 8 year_built 1750 non-null int64 9 crime_rate 1750 non-null float64 dtypes: float64(6), int64(2), object(2) memory usage: 150.4+ KB
df['proximity_to_public_transport'] = 1 / df['distance_to_public_transport']
df['proximity_to_schools'] = 1 / df['distance_to_schools']
df['proximity_to_park'] = 1 / df['distance_to_park']
df.describe()
| units | avg_size | distance_to_public_transport | distance_to_schools | distance_to_park | recent_sale_price | year_built | crime_rate | proximity_to_public_transport | proximity_to_schools | proximity_to_park | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1750.000000 | 1750.000000 | 1750.000000 | 1750.000000 | 1750.000000 | 1.750000e+03 | 1750.000000 | 1750.000000 | 1750.000000 | 1750.000000 | 1750.000000 |
| mean | 21.557143 | 1205.936000 | 0.495354 | 0.548806 | 0.538429 | 1.259086e+06 | 1985.483429 | 15.047886 | 2.822182 | 2.570763 | 2.603409 |
| std | 11.000436 | 188.354657 | 0.235426 | 0.261646 | 0.257098 | 4.286924e+05 | 20.372082 | 5.774560 | 1.973248 | 1.856316 | 1.865713 |
| min | 3.000000 | 700.000000 | 0.100000 | 0.100000 | 0.100000 | 5.004110e+05 | 1950.000000 | 5.000000 | 1.111111 | 1.000000 | 1.000000 |
| 25% | 12.000000 | 1077.000000 | 0.290000 | 0.330000 | 0.310000 | 8.974245e+05 | 1968.000000 | 9.900000 | 1.408451 | 1.298701 | 1.315789 |
| 50% | 22.000000 | 1206.000000 | 0.490000 | 0.540000 | 0.540000 | 1.246108e+06 | 1986.000000 | 15.100000 | 2.040816 | 1.851852 | 1.851852 |
| 75% | 31.000000 | 1335.750000 | 0.710000 | 0.770000 | 0.760000 | 1.640313e+06 | 2003.750000 | 20.000000 | 3.448276 | 3.030303 | 3.225806 |
| max | 40.000000 | 1873.000000 | 0.900000 | 1.000000 | 1.000000 | 1.996792e+06 | 2020.000000 | 25.000000 | 10.000000 | 10.000000 | 10.000000 |
import seaborn as sns
sns.pairplot(df, x_vars=['avg_size', 'proximity_to_public_transport', 'proximity_to_schools' , 'proximity_to_park' , 'year_built', 'crime_rate'], y_vars=['recent_sale_price'], kind='scatter')
plt.show()
D:\PYTHON - ANACONDA\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
plt.figure(figsize=(8, 6))
sns.lineplot(x='avg_size', y='recent_sale_price', data=df, ci='sd')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\594477631.py:2: FutureWarning: The `ci` parameter is deprecated. Use `errorbar='sd'` for the same effect. sns.lineplot(x='avg_size', y='recent_sale_price', data=df, ci='sd')
plt.figure(figsize=(8, 6))
sns.histplot(data=df, x='avg_size', bins=20, kde=True)
plt.xlabel('Average Size')
plt.ylabel('Frequency')
plt.title('Distribution of Average Size')
plt.show()
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='neighborhood', y='avg_size')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Average Size')
plt.title('Neighborhood vs. Average Size')
plt.show()
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='neighborhood', y='avg_size')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Average Size')
plt.title('Neighborhood vs. Average Size')
plt.show()
plt.figure(figsize=(10, 6))
sns.lineplot(x='year_built', y='recent_sale_price', data=df)
plt.xlabel('Year Built')
plt.ylabel('Recent Sale Price')
plt.title('Recent Sale Price Trend over Year Built')
plt.show()
plt.figure(figsize=(20, 6))
sns.boxplot(x='year_built', y='recent_sale_price', data=df)
plt.xlabel('Year Built')
plt.ylabel('Recent Sale Price')
plt.title('Recent Sale Price Distribution by Year Built')
plt.xticks(rotation=70)
plt.show()
plt.figure(figsize=(10, 6))
sns.boxplot(x='property_type', y='recent_sale_price', data=df)
plt.xlabel('Property Type')
plt.ylabel('Recent Sale Price')
plt.title('Recent Sale Price Distribution by Property Type')
plt.show()
plt.figure(figsize=(10, 6))
sns.lineplot(x='neighborhood', y='recent_sale_price', data=df)
plt.xlabel('Neighborhood')
plt.ylabel('Recent Sale Price')
plt.title('Trend of Recent Sale Prices by Neighborhood')
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(10, 6))
sns.lineplot(x='neighborhood', y='crime_rate', data=df)
plt.xlabel('Neighborhood')
plt.ylabel('Crime Rate')
plt.title('Crime Rate by Neighborhood')
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(8, 6))
sns.scatterplot(x='crime_rate', y='recent_sale_price', data=df)
plt.xlabel('Crime Rate')
plt.ylabel('Recent Sale Price')
plt.title('Crime Rate vs. Recent Sale Price')
plt.show()
plt.figure(figsize=(8, 6))
sns.lineplot(data=df, x='crime_rate', y='recent_sale_price', estimator='mean', ci=None)
plt.xlabel('Crime Rate')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Crime Rate')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\2750345418.py:2: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.lineplot(data=df, x='crime_rate', y='recent_sale_price', estimator='mean', ci=None)
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='property_type', y='crime_rate')
plt.xlabel('Property Type')
plt.ylabel('Crime Rate')
plt.title('Property Type vs. Crime Rate')
plt.show()
plt.figure(figsize=(12, 6))
sns.boxplot(x='neighborhood', y='recent_sale_price', data=df)
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='neighborhood', y='proximity_to_public_transport')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Distance to proximity Transport')
plt.title('Neighborhood vs. Proximity to Public Transport')
plt.show()
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='neighborhood', y='proximity_to_public_transport', estimator='mean', ci=None)
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Mean Distance to Proximity Transport')
plt.title('Mean Distance to Proximity Transport by Neighborhood')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\2995364177.py:2: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(data=df, x='neighborhood', y='proximity_to_public_transport', estimator='mean', ci=None)
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='neighborhood', y='proximity_to_public_transport', estimator='mean', ci=None)
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Mean Distance to Proximity Transport')
plt.title('Mean Distance to Proximity Transport by Neighborhood')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\1012449902.py:2: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.lineplot(data=df, x='neighborhood', y='proximity_to_public_transport', estimator='mean', ci=None)
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='neighborhood', y='proximity_to_schools')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('proximity to schools')
plt.title('Neighborhood vs. Proximity to schools')
plt.show()
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='neighborhood', y='proximity_to_schools', estimator='mean', ci=None)
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Mean Proximity to Schools')
plt.title('Mean Proximity to Schools by Neighborhood')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\1011597235.py:2: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(data=df, x='neighborhood', y='proximity_to_schools', estimator='mean', ci=None)
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='neighborhood', y='proximity_to_schools', estimator='mean', ci=None)
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Mean Proximity to Schools')
plt.title('Mean Proximity to Schools by Neighborhood')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\1804502330.py:2: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.lineplot(data=df, x='neighborhood', y='proximity_to_schools', estimator='mean', ci=None)
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='neighborhood', y='proximity_to_park')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('proximity to Parks')
plt.title('Neighborhood vs. Proximity to Parks')
plt.show()
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='neighborhood', y='proximity_to_park', estimator='mean', ci=None)
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Mean Proximity to Parks')
plt.title('Mean Proximity to Parks by Neighborhood')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\3761785318.py:2: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(data=df, x='neighborhood', y='proximity_to_park', estimator='mean', ci=None)
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='neighborhood', y='proximity_to_park', estimator='mean', ci=None)
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Mean Proximity to Parks')
plt.title('Mean Proximity to Parks by Neighborhood')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\2242482461.py:2: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.lineplot(data=df, x='neighborhood', y='proximity_to_park', estimator='mean', ci=None)
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='neighborhood', y='proximity_to_public_transport')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Proximity to public Transport')
plt.title('Neighborhood vs. Proximity to Public Transport')
plt.show()
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='neighborhood', y='proximity_to_schools')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Proximity to Schools')
plt.title('Neighborhood vs. Proximity to Schools')
plt.show()
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='neighborhood', y='proximity_to_park')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Proximity to Parks')
plt.title('Neighborhood vs. Proximity to Parks')
plt.show()
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='neighborhood', y='crime_rate')
plt.xticks(rotation=45)
plt.xlabel('Neighborhood')
plt.ylabel('Crime Rate')
plt.title('Neighborhood vs. Crime Rate')
plt.show()
plt.figure(figsize=(8, 6))
sns.histplot(data=df, x='crime_rate', bins=20, kde=True, hue='recent_sale_price', legend=False)
plt.xlabel('Crime Rate')
plt.ylabel('Frequency')
plt.title('Distribution of Recent Sale Price by Crime Rate')
plt.show()
df['crime_rate_bins'] = pd.cut(df['crime_rate'], bins=10)
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='crime_rate_bins', y='recent_sale_price', ci=None)
plt.xlabel('Crime Rate Bins')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Crime Rate Bins')
plt.xticks(rotation=45)
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\3204610068.py:4: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(data=df, x='crime_rate_bins', y='recent_sale_price', ci=None)
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='crime_rate', y='recent_sale_price', estimator='mean', ci=None)
plt.xlabel('Crime Rate')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Crime Rate')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\3725950289.py:2: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.lineplot(data=df, x='crime_rate', y='recent_sale_price', estimator='mean', ci=None)
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='proximity_to_public_transport', y='recent_sale_price')
plt.xlabel('Proximity to Public Transport')
plt.ylabel('Recent Sale Price')
plt.title('Proximity to Public Transport vs. Recent Sale Price')
plt.show()
df['proximity_bins'] = pd.cut(df['proximity_to_public_transport'], bins=10)
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='proximity_bins', y='recent_sale_price', ci=None)
plt.xlabel('Proximity to Public Transport (Bins)')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Proximity to Public Transport')
plt.xticks(rotation=45)
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\4223314786.py:4: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(data=df, x='proximity_bins', y='recent_sale_price', ci=None)
plt.figure(figsize=(8, 6))
sns.lineplot(data=df, x='proximity_to_public_transport', y='recent_sale_price', estimator='mean', ci=None)
plt.xlabel('Proximity to Public Transport')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Proximity to Public Transport')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\1972856151.py:2: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.lineplot(data=df, x='proximity_to_public_transport', y='recent_sale_price', estimator='mean', ci=None)
df['proximity_bins'] = pd.cut(df['proximity_to_park'], bins=10)
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='proximity_bins', y='recent_sale_price', ci=None)
plt.xlabel('Proximity to Park (Bins)')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Proximity to Park')
plt.xticks(rotation=45)
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\278634783.py:4: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(data=df, x='proximity_bins', y='recent_sale_price', ci=None)
plt.figure(figsize=(8, 6))
sns.lineplot(data=df, x='proximity_to_park', y='recent_sale_price', estimator='mean', ci=None)
plt.xlabel('Proximity to Park')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Proximity to Park')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\314549666.py:2: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.lineplot(data=df, x='proximity_to_park', y='recent_sale_price', estimator='mean', ci=None)
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='proximity_to_schools', y='recent_sale_price')
plt.xlabel('Proximity to schools')
plt.ylabel('Recent Sale Price')
plt.title('Proximity to Schools vs. Recent Sale Price')
plt.show()
df['proximity_bins'] = pd.cut(df['proximity_to_schools'], bins=10)
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='proximity_bins', y='recent_sale_price', ci=None)
plt.xlabel('Proximity to Schools (Bins)')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Proximity to Schools')
plt.xticks(rotation=45)
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\2976156522.py:4: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(data=df, x='proximity_bins', y='recent_sale_price', ci=None)
plt.figure(figsize=(8, 6))
sns.lineplot(data=df, x='proximity_to_schools', y='recent_sale_price', estimator='mean', ci=None)
plt.xlabel('Proximity to Schools')
plt.ylabel('Mean Recent Sale Price')
plt.title('Mean Recent Sale Price by Proximity to Schools')
plt.show()
C:\Users\hassa\AppData\Local\Temp\ipykernel_9556\2817001178.py:2: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.lineplot(data=df, x='proximity_to_schools', y='recent_sale_price', estimator='mean', ci=None)
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='proximity_to_public_transport', y='recent_sale_price')
plt.title('Recent Sale Price trend with Proximity to Public Transport')
plt.xlabel('Proximity to Public Transport')
plt.ylabel('Recent Sale Price')
plt.show()
correlation_matrix = df[['units' , 'crime_rate' , 'avg_size' , 'year_built' , 'proximity_to_public_transport', 'proximity_to_schools', 'proximity_to_park', 'recent_sale_price']].corr()
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()
correlation_matrix = df[['proximity_to_public_transport', 'proximity_to_schools', 'proximity_to_park', 'recent_sale_price']].corr()
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()
import statsmodels.api as sm
X = df[['units', 'avg_size', 'proximity_to_public_transport', 'proximity_to_schools', 'proximity_to_park' , 'year_built', 'crime_rate']]
y = df['recent_sale_price']
X = sm.add_constant(X)
model_ols = sm.OLS(y, X).fit()
print(model_ols.summary())
OLS Regression Results
==============================================================================
Dep. Variable: recent_sale_price R-squared: 0.003
Model: OLS Adj. R-squared: -0.001
Method: Least Squares F-statistic: 0.6738
Date: Thu, 18 Apr 2024 Prob (F-statistic): 0.694
Time: 04:01:17 Log-Likelihood: -25175.
No. Observations: 1750 AIC: 5.037e+04
Df Residuals: 1742 BIC: 5.041e+04
Df Model: 7
Covariance Type: nonrobust
=================================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------------
const 2.469e+06 1e+06 2.461 0.014 5.01e+05 4.44e+06
units 326.9055 933.424 0.350 0.726 -1503.845 2157.656
avg_size -9.7953 54.461 -0.180 0.857 -116.612 97.021
proximity_to_public_transport -6244.0916 5201.767 -1.200 0.230 -1.64e+04 3958.274
proximity_to_schools -2092.6498 5531.818 -0.378 0.705 -1.29e+04 8757.054
proximity_to_park -6076.8969 5510.253 -1.103 0.270 -1.69e+04 4730.510
year_built -597.6282 503.675 -1.187 0.236 -1585.499 390.242
crime_rate 1348.4206 1780.008 0.758 0.449 -2142.756 4839.597
==============================================================================
Omnibus: 1410.383 Durbin-Watson: 2.010
Prob(Omnibus): 0.000 Jarque-Bera (JB): 105.347
Skew: -0.001 Prob(JB): 1.33e-23
Kurtosis: 1.798 Cond. No. 2.27e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.27e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
df['interaction_term'] = df['avg_size'] * df['crime_rate']
X = df[['avg_size', 'crime_rate', 'interaction_term']]
X = sm.add_constant(X)
y = df['recent_sale_price']
model = sm.OLS(y, X).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: recent_sale_price R-squared: 0.001
Model: OLS Adj. R-squared: -0.001
Method: Least Squares F-statistic: 0.3223
Date: Thu, 18 Apr 2024 Prob (F-statistic): 0.809
Time: 04:01:58 Log-Likelihood: -25177.
No. Observations: 1750 AIC: 5.036e+04
Df Residuals: 1746 BIC: 5.038e+04
Df Model: 3
Covariance Type: nonrobust
====================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------
const 1.364e+06 1.81e+05 7.541 0.000 1.01e+06 1.72e+06
avg_size -101.8587 147.344 -0.691 0.489 -390.848 187.131
crime_rate -6299.6834 1.14e+04 -0.554 0.579 -2.86e+04 1.6e+04
interaction_term 6.2083 9.245 0.672 0.502 -11.924 24.341
==============================================================================
Omnibus: 1440.189 Durbin-Watson: 2.003
Prob(Omnibus): 0.000 Jarque-Bera (JB): 105.774
Skew: 0.001 Prob(JB): 1.08e-23
Kurtosis: 1.796 Cond. No. 3.49e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.49e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
df['interaction_term'] = df['avg_size'] * df['units']
X = df[['avg_size', 'units', 'interaction_term']]
X = sm.add_constant(X)
y = df['recent_sale_price']
model = sm.OLS(y, X).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: recent_sale_price R-squared: 0.001
Model: OLS Adj. R-squared: -0.001
Method: Least Squares F-statistic: 0.5358
Date: Thu, 18 Apr 2024 Prob (F-statistic): 0.658
Time: 04:02:11 Log-Likelihood: -25177.
No. Observations: 1750 AIC: 5.036e+04
Df Residuals: 1746 BIC: 5.038e+04
Df Model: 3
Covariance Type: nonrobust
====================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------
const 1.424e+06 1.49e+05 9.557 0.000 1.13e+06 1.72e+06
avg_size -141.3157 121.528 -1.163 0.245 -379.670 97.039
units -7028.1015 6114.061 -1.149 0.251 -1.9e+04 4963.551
interaction_term 6.0498 4.988 1.213 0.225 -3.733 15.832
==============================================================================
Omnibus: 1376.251 Durbin-Watson: 2.004
Prob(Omnibus): 0.000 Jarque-Bera (JB): 104.844
Skew: 0.001 Prob(JB): 1.71e-23
Kurtosis: 1.801 Cond. No. 4.30e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.3e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
df['interaction_term'] = df['avg_size'] * df['proximity_to_public_transport']
X = df[['avg_size', 'proximity_to_public_transport', 'interaction_term']]
X = sm.add_constant(X)
y = df['recent_sale_price']
model = sm.OLS(y, X).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: recent_sale_price R-squared: 0.003
Model: OLS Adj. R-squared: 0.001
Method: Least Squares F-statistic: 1.519
Date: Thu, 18 Apr 2024 Prob (F-statistic): 0.208
Time: 04:02:23 Log-Likelihood: -25175.
No. Observations: 1750 AIC: 5.036e+04
Df Residuals: 1746 BIC: 5.038e+04
Df Model: 3
Covariance Type: nonrobust
=================================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------------
const 1.12e+06 1.16e+05 9.621 0.000 8.92e+05 1.35e+06
avg_size 129.0879 95.263 1.355 0.176 -57.754 315.930
proximity_to_public_transport 5.222e+04 3.33e+04 1.569 0.117 -1.31e+04 1.18e+05
interaction_term -48.2382 27.194 -1.774 0.076 -101.575 5.098
==============================================================================
Omnibus: 1263.443 Durbin-Watson: 2.006
Prob(Omnibus): 0.000 Jarque-Bera (JB): 103.046
Skew: 0.004 Prob(JB): 4.21e-23
Kurtosis: 1.811 Cond. No. 5.06e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.06e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
df['interaction_term'] = df['avg_size'] * df['proximity_to_schools']
X = df[['avg_size', 'proximity_to_schools', 'interaction_term']]
X = sm.add_constant(X)
y = df['recent_sale_price']
model = sm.OLS(y, X).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: recent_sale_price R-squared: 0.000
Model: OLS Adj. R-squared: -0.002
Method: Least Squares F-statistic: 0.06114
Date: Thu, 18 Apr 2024 Prob (F-statistic): 0.980
Time: 04:02:33 Log-Likelihood: -25177.
No. Observations: 1750 AIC: 5.036e+04
Df Residuals: 1746 BIC: 5.038e+04
Df Model: 3
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 1.279e+06 1.13e+05 11.309 0.000 1.06e+06 1.5e+06
avg_size -11.6761 92.370 -0.126 0.899 -192.845 169.492
proximity_to_schools -3006.7583 3.35e+04 -0.090 0.928 -6.87e+04 6.27e+04
interaction_term 0.7093 27.288 0.026 0.979 -52.812 54.230
==============================================================================
Omnibus: 1403.697 Durbin-Watson: 2.005
Prob(Omnibus): 0.000 Jarque-Bera (JB): 105.250
Skew: 0.000 Prob(JB): 1.40e-23
Kurtosis: 1.799 Cond. No. 4.55e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.55e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
df['interaction_term'] = df['avg_size'] * df['proximity_to_park']
X = df[['avg_size', 'proximity_to_park', 'interaction_term']]
X = sm.add_constant(X)
y = df['recent_sale_price']
model = sm.OLS(y, X).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: recent_sale_price R-squared: 0.001
Model: OLS Adj. R-squared: -0.000
Method: Least Squares F-statistic: 0.7377
Date: Thu, 18 Apr 2024 Prob (F-statistic): 0.530
Time: 04:33:05 Log-Likelihood: -25176.
No. Observations: 1750 AIC: 5.036e+04
Df Residuals: 1746 BIC: 5.038e+04
Df Model: 3
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
const 1.19e+06 1.12e+05 10.610 0.000 9.7e+05 1.41e+06
avg_size 69.3588 91.551 0.758 0.449 -110.202 248.920
proximity_to_park 3.054e+04 3.4e+04 0.899 0.369 -3.61e+04 9.72e+04
interaction_term -29.8176 27.696 -1.077 0.282 -84.138 24.503
==============================================================================
Omnibus: 1412.032 Durbin-Watson: 2.006
Prob(Omnibus): 0.000 Jarque-Bera (JB): 105.371
Skew: 0.000 Prob(JB): 1.31e-23
Kurtosis: 1.798 Cond. No. 4.55e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.55e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
print("Data type of y:", y.dtype)
print("Data type of X:")
print(X.dtypes)
Data type of y: float64 Data type of X: const float64 units int64 avg_size float64 proximity_to_public_transport float64 proximity_to_schools float64 proximity_to_park float64 year_built int64 crime_rate float64 neighborhood_Anacostia bool neighborhood_Brookland bool neighborhood_Capitol Hill bool neighborhood_Columbia Heights bool neighborhood_Congress Heights bool neighborhood_Deanwood bool neighborhood_Dupont Circle bool neighborhood_Foggy Bottom bool neighborhood_Georgetown bool neighborhood_Kingman Park bool neighborhood_Logan Circle bool neighborhood_Mount Pleasant bool neighborhood_Petworth bool neighborhood_Shaw bool neighborhood_Southwest Waterfront bool neighborhood_Trinidad bool neighborhood_U Street Corridor bool neighborhood_Van Ness bool neighborhood_Woodley Park bool property_type_Single Family Home bool property_type_Townhouse bool dtype: object
for col in X.columns[X.dtypes == 'bool']:
X[col] = X[col].astype(int)
print(X.dtypes)
X = X.apply(pd.to_numeric, errors='coerce')
print(X.dtypes)
const float64 units int64 avg_size float64 distance_to_public_transport float64 distance_to_schools float64 distance_to_park float64 year_built int64 crime_rate float64 proximity_to_public_transport float64 proximity_to_schools float64 proximity_to_park float64 interaction_term float64 neighborhood_Anacostia bool neighborhood_Brookland bool neighborhood_Capitol Hill bool neighborhood_Columbia Heights bool neighborhood_Congress Heights bool neighborhood_Deanwood bool neighborhood_Dupont Circle bool neighborhood_Foggy Bottom bool neighborhood_Georgetown bool neighborhood_Kingman Park bool neighborhood_Logan Circle bool neighborhood_Mount Pleasant bool neighborhood_Petworth bool neighborhood_Shaw bool neighborhood_Southwest Waterfront bool neighborhood_Trinidad bool neighborhood_U Street Corridor bool neighborhood_Van Ness bool neighborhood_Woodley Park bool property_type_Single Family Home bool property_type_Townhouse bool neighborhood_Adams Morgan bool neighborhood_Anacostia bool neighborhood_Brookland bool neighborhood_Capitol Hill bool neighborhood_Columbia Heights bool neighborhood_Congress Heights bool neighborhood_Deanwood bool neighborhood_Dupont Circle bool neighborhood_Foggy Bottom bool neighborhood_Georgetown bool neighborhood_Kingman Park bool neighborhood_Logan Circle bool neighborhood_Mount Pleasant bool neighborhood_Petworth bool neighborhood_Shaw bool neighborhood_Southwest Waterfront bool neighborhood_Trinidad bool neighborhood_U Street Corridor bool neighborhood_Van Ness bool neighborhood_Woodley Park bool property_type_Condominium bool property_type_Single Family Home bool property_type_Townhouse bool dtype: object const float64 units int64 avg_size float64 distance_to_public_transport float64 distance_to_schools float64 distance_to_park float64 year_built int64 crime_rate float64 proximity_to_public_transport float64 proximity_to_schools float64 proximity_to_park float64 interaction_term float64 neighborhood_Anacostia bool neighborhood_Brookland bool neighborhood_Capitol Hill bool neighborhood_Columbia Heights bool neighborhood_Congress Heights bool neighborhood_Deanwood bool neighborhood_Dupont Circle bool neighborhood_Foggy Bottom bool neighborhood_Georgetown bool neighborhood_Kingman Park bool neighborhood_Logan Circle bool neighborhood_Mount Pleasant bool neighborhood_Petworth bool neighborhood_Shaw bool neighborhood_Southwest Waterfront bool neighborhood_Trinidad bool neighborhood_U Street Corridor bool neighborhood_Van Ness bool neighborhood_Woodley Park bool property_type_Single Family Home bool property_type_Townhouse bool neighborhood_Adams Morgan bool neighborhood_Anacostia bool neighborhood_Brookland bool neighborhood_Capitol Hill bool neighborhood_Columbia Heights bool neighborhood_Congress Heights bool neighborhood_Deanwood bool neighborhood_Dupont Circle bool neighborhood_Foggy Bottom bool neighborhood_Georgetown bool neighborhood_Kingman Park bool neighborhood_Logan Circle bool neighborhood_Mount Pleasant bool neighborhood_Petworth bool neighborhood_Shaw bool neighborhood_Southwest Waterfront bool neighborhood_Trinidad bool neighborhood_U Street Corridor bool neighborhood_Van Ness bool neighborhood_Woodley Park bool property_type_Condominium bool property_type_Single Family Home bool property_type_Townhouse bool dtype: object
X = X.astype(int)
ols_model = sm.OLS(y, X)
ols_results = ols_model.fit()
print(ols_results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: recent_sale_price R-squared: 0.013
Model: OLS Adj. R-squared: -0.005
Method: Least Squares F-statistic: 0.7239
Date: Thu, 18 Apr 2024 Prob (F-statistic): 0.867
Time: 04:32:07 Log-Likelihood: -25166.
No. Observations: 1750 AIC: 5.040e+04
Df Residuals: 1718 BIC: 5.057e+04
Df Model: 31
Covariance Type: nonrobust
=====================================================================================================
coef std err t P>|t| [0.025 0.975]
-----------------------------------------------------------------------------------------------------
const 1.559e+06 6.33e+05 2.463 0.014 3.18e+05 2.8e+06
units 462.2484 939.565 0.492 0.623 -1380.564 2305.060
avg_size 15.6411 78.580 0.199 0.842 -138.482 169.764
distance_to_public_transport 2.649e-06 1.56e-06 1.701 0.089 -4.05e-07 5.7e-06
distance_to_schools -1.2e+04 1.21e+05 -0.100 0.921 -2.48e+05 2.24e+05
distance_to_park -1.227e+04 1.45e+05 -0.084 0.933 -2.97e+05 2.73e+05
year_built -619.6622 506.499 -1.223 0.221 -1613.082 373.758
crime_rate 1118.4326 1795.468 0.623 0.533 -2403.102 4639.967
proximity_to_public_transport -7317.3102 5180.094 -1.413 0.158 -1.75e+04 2842.646
proximity_to_schools -1894.1038 5615.532 -0.337 0.736 -1.29e+04 9119.897
proximity_to_park 7083.3340 2.58e+04 0.275 0.783 -4.34e+04 5.76e+04
interaction_term -11.8283 20.970 -0.564 0.573 -52.958 29.301
neighborhood_Anacostia 7.042e+04 3.76e+04 1.871 0.062 -3402.255 1.44e+05
neighborhood_Brookland 8.946e+04 3.79e+04 2.362 0.018 1.52e+04 1.64e+05
neighborhood_Capitol Hill 9.25e+04 3.85e+04 2.401 0.016 1.69e+04 1.68e+05
neighborhood_Columbia Heights 1.041e+05 3.73e+04 2.794 0.005 3.1e+04 1.77e+05
neighborhood_Congress Heights 8.953e+04 3.67e+04 2.436 0.015 1.74e+04 1.62e+05
neighborhood_Deanwood 6.126e+04 3.86e+04 1.586 0.113 -1.45e+04 1.37e+05
neighborhood_Dupont Circle 9.949e+04 3.84e+04 2.589 0.010 2.41e+04 1.75e+05
neighborhood_Foggy Bottom 7.555e+04 3.92e+04 1.929 0.054 -1265.503 1.52e+05
neighborhood_Georgetown 7.347e+04 3.89e+04 1.890 0.059 -2753.734 1.5e+05
neighborhood_Kingman Park 2.047e+04 3.82e+04 0.535 0.592 -5.45e+04 9.55e+04
neighborhood_Logan Circle 7.073e+04 3.69e+04 1.916 0.056 -1682.959 1.43e+05
neighborhood_Mount Pleasant 9.044e+04 3.73e+04 2.426 0.015 1.73e+04 1.64e+05
neighborhood_Petworth 5.965e+04 3.67e+04 1.627 0.104 -1.23e+04 1.32e+05
neighborhood_Shaw 5.314e+04 3.62e+04 1.470 0.142 -1.78e+04 1.24e+05
neighborhood_Southwest Waterfront 7.864e+04 3.77e+04 2.087 0.037 4730.477 1.53e+05
neighborhood_Trinidad 6.43e+04 3.77e+04 1.706 0.088 -9635.341 1.38e+05
neighborhood_U Street Corridor 3.733e+04 3.93e+04 0.949 0.343 -3.98e+04 1.14e+05
neighborhood_Van Ness 9.577e+04 3.73e+04 2.568 0.010 2.26e+04 1.69e+05
neighborhood_Woodley Park 9.277e+04 3.72e+04 2.491 0.013 1.97e+04 1.66e+05
property_type_Single Family Home 3.937e+05 1.58e+05 2.484 0.013 8.28e+04 7.04e+05
property_type_Townhouse 3.801e+05 1.59e+05 2.396 0.017 6.89e+04 6.91e+05
neighborhood_Adams Morgan 1.404e+05 7.27e+04 1.932 0.053 -2114.381 2.83e+05
neighborhood_Anacostia 7.042e+04 3.76e+04 1.871 0.062 -3402.255 1.44e+05
neighborhood_Brookland 8.946e+04 3.79e+04 2.362 0.018 1.52e+04 1.64e+05
neighborhood_Capitol Hill 9.25e+04 3.85e+04 2.401 0.016 1.69e+04 1.68e+05
neighborhood_Columbia Heights 1.041e+05 3.73e+04 2.794 0.005 3.1e+04 1.77e+05
neighborhood_Congress Heights 8.953e+04 3.67e+04 2.436 0.015 1.74e+04 1.62e+05
neighborhood_Deanwood 6.126e+04 3.86e+04 1.586 0.113 -1.45e+04 1.37e+05
neighborhood_Dupont Circle 9.949e+04 3.84e+04 2.589 0.010 2.41e+04 1.75e+05
neighborhood_Foggy Bottom 7.555e+04 3.92e+04 1.929 0.054 -1265.503 1.52e+05
neighborhood_Georgetown 7.347e+04 3.89e+04 1.890 0.059 -2753.734 1.5e+05
neighborhood_Kingman Park 2.047e+04 3.82e+04 0.535 0.592 -5.45e+04 9.55e+04
neighborhood_Logan Circle 7.073e+04 3.69e+04 1.916 0.056 -1682.959 1.43e+05
neighborhood_Mount Pleasant 9.044e+04 3.73e+04 2.426 0.015 1.73e+04 1.64e+05
neighborhood_Petworth 5.965e+04 3.67e+04 1.627 0.104 -1.23e+04 1.32e+05
neighborhood_Shaw 5.314e+04 3.62e+04 1.470 0.142 -1.78e+04 1.24e+05
neighborhood_Southwest Waterfront 7.864e+04 3.77e+04 2.087 0.037 4730.477 1.53e+05
neighborhood_Trinidad 6.43e+04 3.77e+04 1.706 0.088 -9635.341 1.38e+05
neighborhood_U Street Corridor 3.733e+04 3.93e+04 0.949 0.343 -3.98e+04 1.14e+05
neighborhood_Van Ness 9.577e+04 3.73e+04 2.568 0.010 2.26e+04 1.69e+05
neighborhood_Woodley Park 9.277e+04 3.72e+04 2.491 0.013 1.97e+04 1.66e+05
property_type_Condominium 7.857e+05 3.17e+05 2.482 0.013 1.65e+05 1.41e+06
property_type_Single Family Home 3.937e+05 1.58e+05 2.484 0.013 8.28e+04 7.04e+05
property_type_Townhouse 3.801e+05 1.59e+05 2.396 0.017 6.89e+04 6.91e+05
==============================================================================
Omnibus: 1333.795 Durbin-Watson: 2.003
Prob(Omnibus): 0.000 Jarque-Bera (JB): 104.205
Skew: -0.007 Prob(JB): 2.36e-23
Kurtosis: 1.805 Cond. No. 1.65e+17
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.23e-24. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
from sklearn.linear_model import LinearRegression
X = df[['units', 'avg_size', 'proximity_to_public_transport', 'proximity_to_schools', 'proximity_to_park' , 'year_built', 'crime_rate']]
y = df['recent_sale_price']
model_multiple_linear_regression = LinearRegression()
model_multiple_linear_regression.fit(X, y)
print('Intercept:', model_multiple_linear_regression.intercept_)
print('Coefficients:', model_multiple_linear_regression.coef_)
from sklearn.metrics import r2_score
predictions = model_multiple_linear_regression.predict(X)
r_squared = r2_score(y, predictions)
print('R-squared:', r_squared)
from sklearn.metrics import mean_absolute_error, mean_squared_error
mae = mean_absolute_error(y, predictions)
mse = mean_squared_error(y, predictions)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
Intercept: 2468963.6708604773 Coefficients: [ 326.90551452 -9.79526605 -6244.09158944 -2092.64983014 -6076.8968629 -597.62820523 1348.42057411] R-squared: 0.002700395453252291 Mean Absolute Error: 371330.1344120291 Mean Squared Error: 183176186226.81674
from sklearn.neural_network import MLPRegressor
X = df[['units', 'avg_size', 'proximity_to_public_transport', 'proximity_to_schools', 'proximity_to_park' , 'year_built', 'crime_rate']]
y = df['recent_sale_price']
model_neural_network = MLPRegressor()
model_neural_network.fit(X, y)
predictions = model_neural_network.predict(X)
print('Predictions:', predictions[:5])
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(y, predictions)
mse = mean_squared_error(y, predictions)
r2 = r2_score(y, predictions)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('R-squared:', r2)
Predictions: [989410.00098055 842980.06261391 902335.02376661 797199.59631309 905213.46671496] Mean Absolute Error: 458895.696446196 Mean Squared Error: 316657661688.8813 R-squared: -0.7240372085702156
D:\PYTHON - ANACONDA\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn(
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
numeric_transformer = Pipeline(steps=[
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
ridge_pipeline = Pipeline([
('preprocessor', preprocessor),
('ridge', Ridge(alpha=1.0))
])
ridge_pipeline.fit(X_train, y_train)
ridge_coefs = ridge_pipeline.named_steps['ridge'].coef_
numeric_feature_names = preprocessor.transformers_[0][2]
categorical_feature_names = ridge_pipeline.named_steps['preprocessor'] \
.named_transformers_['cat'] \
.named_steps['onehot'] \
.get_feature_names_out(input_features=categorical_features)
feature_names = np.concatenate([numeric_feature_names, categorical_feature_names])
coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': ridge_coefs})
print(coefficients)
Feature Coefficient 0 units 4241.587901 1 avg_size 5087.017658 2 distance_to_public_transport -19992.318008 3 distance_to_schools -8438.280193 4 distance_to_park 5562.966475 5 year_built -6420.145866 6 crime_rate 431.052182 7 proximity_to_public_transport -30555.963268 8 proximity_to_schools -14969.563216 9 proximity_to_park -6007.061202 10 neighborhood_Adams Morgan -24176.982979 11 neighborhood_Anacostia -18099.125855 12 neighborhood_Brookland 8338.023897 13 neighborhood_Capitol Hill 33032.630395 14 neighborhood_Columbia Heights 61656.048947 15 neighborhood_Congress Heights 26871.035106 16 neighborhood_Deanwood -9938.949770 17 neighborhood_Dupont Circle 35007.568600 18 neighborhood_Foggy Bottom 10845.000565 19 neighborhood_Georgetown -16154.167371 20 neighborhood_Kingman Park -88380.606271 21 neighborhood_Logan Circle 29051.133741 22 neighborhood_Mount Pleasant 53640.159890 23 neighborhood_Petworth -36925.707000 24 neighborhood_Shaw -28395.486934 25 neighborhood_Southwest Waterfront 20990.691595 26 neighborhood_Trinidad -23594.255931 27 neighborhood_U Street Corridor -90271.855266 28 neighborhood_Van Ness 28253.108643 29 neighborhood_Woodley Park 28251.735996 30 property_type_Condominium 7390.384523 31 property_type_Single Family Home 15741.964156 32 property_type_Townhouse -23132.348679
print("Ridge Regression Coefficients:")
print(ridge_pipeline.named_steps['ridge'].coef_)
print("\nRidge Regression Intercept:")
print(ridge_pipeline.named_steps['ridge'].intercept_)
Ridge Regression Coefficients: [ 4241.58790097 5087.01765842 -19992.31800829 -8438.28019253 5562.9664754 -6420.14586566 431.05218157 -30555.96326826 -14969.56321551 -6007.06120248 -24176.98297856 -18099.12585456 8338.02389688 33032.63039521 61656.04894667 26871.03510551 -9938.94977046 35007.56859982 10845.00056538 -16154.16737124 -88380.6062706 29051.13374132 53640.15989048 -36925.70700003 -28395.48693403 20990.69159521 -23594.25593098 -90271.85526579 28253.10864335 28251.73599636 7390.38452337 15741.96415598 -23132.34867934] Ridge Regression Intercept: 1254497.3264722212
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
y_train_pred = ridge_pipeline.predict(X_train)
y_test_pred = ridge_pipeline.predict(X_test)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
print("Training R-squared:", train_r2)
print("Testing R-squared:", test_r2)
print("\nTraining Mean Squared Error (MSE):", train_mse)
print("Testing Mean Squared Error (MSE):", test_mse)
print("\nTraining Mean Absolute Error (MAE):", train_mae)
print("Testing Mean Absolute Error (MAE):", test_mae)
Training R-squared: 0.01480160617945514 Testing R-squared: -0.019085432492405463 Training Mean Squared Error (MSE): 160531316961.72372 Testing Mean Squared Error (MSE): 159733095775.36615 Training Mean Absolute Error (MAE): 330501.5824404619 Testing Mean Absolute Error (MAE): 332631.973222693
from sklearn.linear_model import Lasso
lasso_pipeline = Pipeline([
('preprocessor', preprocessor),
('lasso', Lasso(alpha=1.0))
])
lasso_pipeline.fit(X_train, y_train)
D:\PYTHON - ANACONDA\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:628: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.439e+12, tolerance: 2.607e+10 model = cd_fast.enet_coordinate_descent(
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
StandardScaler())]),
Index(['units', 'avg_size', 'distance_to_public_transport',
'distance_to_schools', 'distance_to_park', 'year_built', 'crime_rate',
'proximity_to_public_transport', 'proximity_to_schools',
'proximity_to_park'],
dtype='object')),
('cat',
Pipeline(steps=[('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['neighborhood', 'property_type'], dtype='object'))])),
('lasso', Lasso())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler',
StandardScaler())]),
Index(['units', 'avg_size', 'distance_to_public_transport',
'distance_to_schools', 'distance_to_park', 'year_built', 'crime_rate',
'proximity_to_public_transport', 'proximity_to_schools',
'proximity_to_park'],
dtype='object')),
('cat',
Pipeline(steps=[('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['neighborhood', 'property_type'], dtype='object'))])),
('lasso', Lasso())])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('scaler', StandardScaler())]),
Index(['units', 'avg_size', 'distance_to_public_transport',
'distance_to_schools', 'distance_to_park', 'year_built', 'crime_rate',
'proximity_to_public_transport', 'proximity_to_schools',
'proximity_to_park'],
dtype='object')),
('cat',
Pipeline(steps=[('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['neighborhood', 'property_type'], dtype='object'))])Index(['units', 'avg_size', 'distance_to_public_transport',
'distance_to_schools', 'distance_to_park', 'year_built', 'crime_rate',
'proximity_to_public_transport', 'proximity_to_schools',
'proximity_to_park'],
dtype='object')StandardScaler()
Index(['neighborhood', 'property_type'], dtype='object')
OneHotEncoder(handle_unknown='ignore')
Lasso()
processed_columns = list(lasso_pipeline.named_steps['preprocessor']
.named_transformers_['cat']
.named_steps['onehot']
.get_feature_names_out(categorical_features))
all_feature_names = list(numeric_features) + processed_columns
lasso_coefficients = lasso_pipeline.named_steps['lasso'].coef_
coefficients_df = pd.DataFrame({'Feature': all_feature_names, 'Coefficient': lasso_coefficients})
print(coefficients_df.sort_values(by='Coefficient', key=abs, ascending=False))
train_preds = lasso_pipeline.predict(X_train)
test_preds = lasso_pipeline.predict(X_test)
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
train_r2 = r2_score(y_train, train_preds)
print("Training R-squared:", train_r2)
test_r2 = r2_score(y_test, test_preds)
print("Testing R-squared:", test_r2)
train_mse = mean_squared_error(y_train, train_preds)
print("Training Mean Squared Error (MSE):", train_mse)
test_mse = mean_squared_error(y_test, test_preds)
print("Testing Mean Squared Error (MSE):", test_mse)
train_mae = mean_absolute_error(y_train, train_preds)
print("Training Mean Absolute Error (MAE):", train_mae)
test_mae = mean_absolute_error(y_test, test_preds)
print("Testing Mean Absolute Error (MAE):", test_mae)
Feature Coefficient 27 neighborhood_U Street Corridor -93494.465789 20 neighborhood_Kingman Park -91339.212019 14 neighborhood_Columbia Heights 60537.678310 22 neighborhood_Mount Pleasant 52370.775665 23 neighborhood_Petworth -39239.571033 17 neighborhood_Dupont Circle 33587.745014 13 neighborhood_Capitol Hill 31645.224161 7 proximity_to_public_transport -30692.676341 24 neighborhood_Shaw -30562.598600 21 neighborhood_Logan Circle 27496.733445 28 neighborhood_Van Ness 26711.399089 29 neighborhood_Woodley Park 26702.848197 10 neighborhood_Adams Morgan -26307.190419 26 neighborhood_Trinidad -25772.325956 15 neighborhood_Congress Heights 25219.638622 31 property_type_Single Family Home 22639.640280 11 neighborhood_Anacostia -20168.851738 2 distance_to_public_transport -20124.552823 25 neighborhood_Southwest Waterfront 19343.145506 19 neighborhood_Georgetown -18280.968585 32 property_type_Townhouse -16313.562083 8 proximity_to_schools -15022.633723 30 property_type_Condominium 14222.921766 16 neighborhood_Deanwood -11941.103919 18 neighborhood_Foggy Bottom 9110.878070 3 distance_to_schools -8496.931232 12 neighborhood_Brookland 6542.955689 5 year_built -6429.707350 9 proximity_to_park -6011.485785 4 distance_to_park 5582.990966 1 avg_size 5081.627157 0 units 4259.733619 6 crime_rate 417.882821 Training R-squared: 0.014803292845458782 Testing R-squared: -0.019340358961843274 Training Mean Squared Error (MSE): 160531042131.07382 Testing Mean Squared Error (MSE): 159773053361.70786 Training Mean Absolute Error (MAE): 330538.84752715856 Testing Mean Absolute Error (MAE): 332704.4666508229
from sklearn.ensemble import RandomForestRegressor
rfr_pipeline = Pipeline([
('preprocessor', preprocessor),
('random_forest', RandomForestRegressor(n_estimators=100, random_state=42))
])
rfr_pipeline.fit(X_train, y_train)
y_pred_rfr = rfr_pipeline.predict(X_test)
processed_columns = list(rfr_pipeline.named_steps['preprocessor']
.named_transformers_['cat']
.named_steps['onehot']
.get_feature_names_out(categorical_features))
all_feature_names = list(numeric_features) + processed_columns
feature_importances = rfr_pipeline.named_steps['random_forest'].feature_importances_
importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': feature_importances})
print(importance_df.sort_values(by='Importance', ascending=False))
mse_rfr = mean_squared_error(y_test, y_pred_rfr)
r2_rfr = r2_score(y_test, y_pred_rfr)
print("Mean Squared Error (MSE) - RFR:", mse_rfr)
print("R-squared - RFR:", r2_rfr)
Feature Importance 1 avg_size 0.130950 6 crime_rate 0.125512 5 year_built 0.107460 0 units 0.096296 9 proximity_to_park 0.061457 3 distance_to_schools 0.061268 8 proximity_to_schools 0.061208 4 distance_to_park 0.061193 2 distance_to_public_transport 0.061118 7 proximity_to_public_transport 0.060920 31 property_type_Single Family Home 0.012846 30 property_type_Condominium 0.012149 21 neighborhood_Logan Circle 0.010906 32 property_type_Townhouse 0.010688 26 neighborhood_Trinidad 0.008974 15 neighborhood_Congress Heights 0.008879 11 neighborhood_Anacostia 0.008799 20 neighborhood_Kingman Park 0.007731 28 neighborhood_Van Ness 0.007354 22 neighborhood_Mount Pleasant 0.007295 23 neighborhood_Petworth 0.006862 24 neighborhood_Shaw 0.006665 14 neighborhood_Columbia Heights 0.006662 12 neighborhood_Brookland 0.006249 27 neighborhood_U Street Corridor 0.006221 29 neighborhood_Woodley Park 0.006033 25 neighborhood_Southwest Waterfront 0.005926 10 neighborhood_Adams Morgan 0.005871 18 neighborhood_Foggy Bottom 0.005533 17 neighborhood_Dupont Circle 0.005456 16 neighborhood_Deanwood 0.005394 19 neighborhood_Georgetown 0.005344 13 neighborhood_Capitol Hill 0.004780 Mean Squared Error (MSE) - RFR: 160201542787.94522 R-squared - RFR: -0.022074090065819618
plt.figure(figsize=(10, 6))
plt.barh(feature_names, feature_importances)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importances')
plt.show()